library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(readxl)
library(here)
## here() starts at C:/Users/Malcolm/Documents/Code/GitHub/dirty_data_codeclan_project_mcheyne/Task 3 Sea bird observation data

Reading in the data

# Load the cleaned seabird observations written by the cleaning script.
# NOTE(review): the column specification below shows sex, air_temp and salinity
# guessed as logical -- presumably because the rows used for type guessing are
# all NA; confirm before relying on those columns.
clean_data <- here("clean_data/seabirds_cleaned_data.csv") %>%
  read_csv()
## Rows: 49020 Columns: 52
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (19): common_name, scientific_name, species_abbreviation, age, plphase,...
## dbl  (28): record_x, record_id, wanplum, total_sighting, num_feeding, num_on...
## lgl   (3): sex, air_temp, salinity
## dttm  (2): date, time
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
clean_data

Brief

Clean the seabird observation data so that it can be used to answer the questions below.

Assumptions

Removed any records marked “NO BIRDS RECORDED”.

Removed “sensu lato”, “(unidentified)” and any upper-case letters at the end of common_name, as these were not part of the name but descriptions of the birds.

I have added “Ext” parts to the questions because I was not sure of your definition of type and species, i.e. whether Royal / Wandering albatross and Black-browed albatross are different birds, or whether they are all the same species and should simply be counted as albatross. My first answer to each question uses the former; the Ext parts use the latter.

Cleaning steps

Use a script file to clean

full_join() the 2 data sheets so as not to lose any data

clean_names()

Renamed the columns for easier reading

recode() the data for easier reading

Removed descriptions of the birds from the common_name

Write to a .csv file

Selecting the relevant data

# Keep only the columns that the questions below rely on.
seabirds <- select(
  clean_data,
  record_id,
  common_name,
  scientific_name,
  species_abbreviation,
  total_sighting,
  num_group_sighting,
  lat
)
seabirds

Questions

Q1 Which bird had the most individual sightings?

# Number of sighting records per bird, largest first.
seabirds %>%
  count(common_name, name = "count", sort = TRUE)
# Wandering albatross   11293

Ext Q1 combining all the albatross as one type

# Treat every albatross as a single type.
# Q1 ranks birds by the number of sighting records, so that record count is
# reported here as `sightings`, alongside the summed total_sighting (`count`).
seabirds %>%
  filter(str_detect(common_name, "(?i)albatross")) %>%
  summarise(
    sightings = n(),
    count = sum(total_sighting, na.rm = TRUE)
  )
# All albatross 30424 (the `count` column, i.e. summed total_sighting);
# the `sightings` column is the figure comparable with the Q1 answer.

Q2 Which bird had the highest total count?

# Sum total_sighting per bird (missing counts ignored), largest first.
seabirds %>%
  count(common_name, wt = total_sighting, name = "count", sort = TRUE)
# Short-tailed shearwater   982553

Ext Q2 combining all the shearwater as one type

# Treat every shearwater as a single type and add up total_sighting.
seabirds %>%
  filter(str_detect(common_name, "(?i)shearwater")) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE))
# All shearwater 1394468

Q3 Which bird had the highest total count above a latitude of -30?

# "Above a latitude of -30" is taken to mean nearer the equator (0),
# i.e. lat > -30. Rows are restricted first, then totalled per bird.
seabirds %>%
  filter(lat > -30) %>%
  group_by(common_name) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE)) %>%
  arrange(desc(count))
# Wedge-tailed shearwater   855

Ext Q3 combining all the shearwater as one type

# Treat every shearwater as a single type, keeping only rows with lat > -30.
seabirds %>%
  filter(
    str_detect(common_name, "(?i)shearwater"),
    lat > -30
  ) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE))
# All shearwater seen above a latitude of -30 (nearer the equator, i.e. 0): 888

Q4 How many different types of birds were only ever seen in groups of 1?

# Birds whose largest recorded num_group_sighting is exactly 1.
# Rows with a missing num_group_sighting are dropped before aggregating, so a
# bird with no recorded values never reaches max(); previously max() was called
# on an empty vector for such birds, returning -Inf with a warning each time.
# Those birds are excluded either way, so the set of birds is unchanged.
seabirds %>%
  filter(!is.na(num_group_sighting)) %>%
  group_by(common_name) %>%
  summarise(max_sighting = max(num_group_sighting)) %>%
  filter(max_sighting == 1) %>%
  select(common_name)
# 23 birds were only ever seen in groups of 1

Ext Q4 If by types of birds you mean more individual species with no variations

# Collapse each variation onto its family name before looking for birds whose
# largest recorded num_group_sighting is exactly 1.
# case_when() takes the first matching branch, which keeps the precedence of
# the original chain: shearwater, albatross, mollymawk, petrel, prion, skua.
# Names matching none of the patterns (including NA) are left as they are.
# The recode is done before grouping so the groups are built explicitly from
# the combined names rather than by modifying a grouping column in place.
# Rows with a missing num_group_sighting are dropped first so max() is never
# called on an empty vector (which returned -Inf with a warning).
seabirds %>%
  mutate(
    common_name = case_when(
      str_detect(common_name, "(?i)shearwater") ~ "shearwater",
      str_detect(common_name, "(?i)albatross") ~ "albatross",
      str_detect(common_name, "(?i)mollymawk") ~ "mollymawk",
      str_detect(common_name, "(?i)petrel") ~ "petrel",
      str_detect(common_name, "(?i)prion") ~ "prion",
      str_detect(common_name, "(?i)skua") ~ "skua",
      TRUE ~ common_name
    )
  ) %>%
  filter(!is.na(num_group_sighting)) %>%
  group_by(common_name) %>%
  summarise(max_sighting = max(num_group_sighting)) %>%
  filter(max_sighting == 1) %>%
  select(common_name)
# 2 individual species of birds were only ever seen in groups of 1
# when the variations are combined as one group

Q5 How many penguins were seen? (Hint: there are many types of penguin)

# Add up total_sighting across every bird whose name mentions "penguin".
seabirds %>%
  filter(str_detect(common_name, "(?i)penguin")) %>%
  summarise(count = sum(total_sighting, na.rm = TRUE))
# 158   penguins